import pandas as pd

# Country-to-country migration estimates. `na_filter=False` turns off pandas'
# missing-value detection, so empty cells are read as "" rather than NaN
# (presumably also so that the ISO code "NA" is not parsed as missing --
# TODO confirm).
estimate = pd.read_csv("international_migration_flow.csv", na_filter=False)

# ISO 3166 country table with regional codes, read with the same setting.
country_code = pd.read_csv(
    "https://raw.githubusercontent.com/lukes/ISO-3166-Countries-with-Regional-Codes/refs/heads/master/all/all.csv",
    na_filter=False,
)

# Two-column lookup (alpha-2 code, sub-region) restricted to rows where both
# fields are present and non-empty.
region = country_code[["alpha-2", "sub-region"]].dropna()
region = region[(region["alpha-2"] != "") & (region["sub-region"] != "")]

# Translate each `sub-region` label into the larger region used for the
# exported flows, to avoid confusion between fine-grained sub-regions.
# See https://www.nature.com/articles/s41597-022-01271-z
# Entries are grouped by target region and then sorted by sub-region name.
region_mapping = dict(
    sorted(
        {
            # Sub-regions folded into a broader region.
            **dict.fromkeys(
                [
                    "Australia and New Zealand",
                    "Melanesia",
                    "Micronesia",
                    "Polynesia",
                ],
                "Oceania",
            ),
            **dict.fromkeys(
                ["Central Asia", "Eastern Europe"],
                "Eastern Europe and Central Asia",
            ),
            **dict.fromkeys(
                ["Northern Europe", "Southern Europe", "Western Europe"],
                "Europe",
            ),
            # Sub-regions that keep their own name.
            **{
                name: name
                for name in [
                    "Eastern Asia",
                    "Latin America and the Caribbean",
                    "Northern Africa",
                    "Northern America",
                    "South-eastern Asia",
                    "Southern Asia",
                    "Sub-Saharan Africa",
                    "Western Asia",
                ]
            },
        }.items()
    )
)

# Attach the merged region name to every country row. `assign` builds a new
# frame instead of writing a column into the filtered `region` slice, which
# avoids pandas' SettingWithCopyWarning. The per-value dict lookup (rather
# than a row-wise `apply`) also works on an empty table and still raises
# KeyError if a sub-region is missing from `region_mapping`.
region = region.assign(
    sub_region_map=[region_mapping[name] for name in region["sub-region"]]
)

# alpha-2 country code -> merged region name.
region_dict = dict(zip(region["alpha-2"], region["sub_region_map"]))
# Force the entry for "TW" to "Eastern Asia", whatever the lookup table holds.
region_dict["TW"] = "Eastern Asia"


def get_data(y: str, *, min_total: float = 1e6) -> None:
    """Aggregate one period's country-level flows to regions and write a CSV.

    Reads the module-level ``estimate`` table and ``region_dict`` lookup and
    writes ``migration_flow_subcontinent_map_{y}.csv`` (columns ``from``,
    ``to``, ``num_migrants``) to the current working directory. A period with
    no matching rows produces a header-only file.

    Args:
        y: Period selector such as ``"2022"``. Rows are kept when their
            ``migration_month`` matches it through ``Series.str.contains``,
            which treats the value as a regular-expression pattern by
            default. The same text is embedded in the output file name.
        min_total: A region is kept only when its outflow plus its inflow is
            strictly greater than this value. Flows inside a region count on
            both sides. Only flows between two kept regions are written.

    Raises:
        KeyError: If a non-excluded country code has no entry in
            ``region_dict``.
    """
    # Codes dropped by the original author as small territories without a
    # continent, or possibly not officially recognized.
    excluded = ["XK", "AN", "ZG", "AQ"]

    in_period = estimate["migration_month"].str.contains(y)
    flows = (
        estimate[in_period]
        .groupby(["country_from", "country_to"], as_index=False)
        .agg({"num_migrants": "sum"})
    )
    flows = flows[
        ~flows["country_from"].isin(excluded) & ~flows["country_to"].isin(excluded)
    ]

    # Series.map leaves NaN for unmapped keys, and the groupby below would then
    # silently drop those rows, so fail loudly here instead.
    codes = set(flows["country_from"]) | set(flows["country_to"])
    unknown = sorted(codes.difference(region_dict), key=str)
    if unknown:
        raise KeyError(f"no region mapping for country code(s): {unknown}")

    # `from` is a Python keyword, hence the dict unpacking. `assign` returns a
    # new frame, so nothing is written into the filtered slice.
    flows = flows.assign(
        **{
            "from": flows["country_from"].map(region_dict),
            "to": flows["country_to"].map(region_dict),
        }
    )
    agg = flows.groupby(["from", "to"], as_index=False)["num_migrants"].sum()

    # Total traffic per region = everything leaving it + everything entering it.
    outflow = agg.groupby("from")["num_migrants"].sum()
    inflow = agg.groupby("to")["num_migrants"].sum()
    totals = outflow.add(inflow, fill_value=0)
    regions_selected = totals[totals > min_total].index

    agg_v2 = agg[
        agg["from"].isin(regions_selected) & agg["to"].isin(regions_selected)
    ]
    agg_v2.to_csv(f"migration_flow_subcontinent_map_{y}.csv", index=False)


# Runs at module execution: export the regional flow table for 2022.
get_data("2022")
